{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "363193dd",
   "metadata": {},
   "source": [
    "### Correlate misexpression events with biological and technical covariates "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0a531208",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from scipy.stats import spearmanr\n",
    "from pathlib import Path\n",
    "from statsmodels.stats import multitest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "11192987",
   "metadata": {},
   "outputs": [],
   "source": [
    "# inputs \n",
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "tpm_mtx_inactive_path = wkdir_path.joinpath(\"1_rna_seq_qc/tpm_mtx_inactive/tpm_4568samples_8779genes_inactive.tsv\")\n",
    "zscore_tpm_flat_path =wkdir_path.joinpath(\"1_rna_seq_qc/zscore_tpm_flat/tpm_zscore_4568smpls_8739genes_tpm0.1_frac_5.0perc_flat.csv\")\n",
    "covariates_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/phenotypes/rna_seq/processed_v97/covariates/master/master_covariates_v97_swapd_depth_fastq_rin_cell_sex_pcs_season_batch_fc_pipelines_updtd.tsv\"\n",
    "xcell_enrich_path =wkdir_path.joinpath(\"2_misexp_qc/xcell/xCell_estimates.tsv\")\n",
    "out_dir = wkdir_path.joinpath(\"2_misexp_qc/misexp_gene_cov_corr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ca25db5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# variables \n",
    "spearman_rho_cutoff = 0.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c5280891",
   "metadata": {},
   "outputs": [],
   "source": [
    "out_dir_path = Path(out_dir)\n",
    "out_dir_path.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4f71adb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load TPM matrix of inactive genes \n",
    "tpm_mtx_inactive_df = pd.read_csv(tpm_mtx_inactive_path, sep=\"\\t\")\n",
    "# read in covariates \n",
    "covariates_df = pd.read_csv(covariates_path, sep=\"\\t\")\n",
    "# transpose and clean dataframe \n",
    "tpm_mtx_inactive_tp_df = tpm_mtx_inactive_df.set_index(\"gene_id\").transpose()\n",
    "# remove genes with all zeroes\n",
    "tpm_mtx_inactive_tp_rmv_all_zero_df = tpm_mtx_inactive_tp_df.loc[:, (tpm_mtx_inactive_tp_df != 0).any(axis=0)]\n",
    "gene_test_set = tpm_mtx_inactive_tp_rmv_all_zero_df.columns.unique()\n",
    "tpm_mtx_inactive_tp_clean_df = tpm_mtx_inactive_tp_rmv_all_zero_df.reset_index().rename(columns={\"index\":\"rna_id\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "736af44b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of inactive genes: 8779\n"
     ]
    }
   ],
   "source": [
    "# all inactive genes\n",
    "inactive_genes = tpm_mtx_inactive_df.gene_id.unique()\n",
    "print(f\"Total number of inactive genes: {len(inactive_genes)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "81178406",
   "metadata": {},
   "outputs": [],
   "source": [
    "### correlation with covariates \n",
    "\n",
    "# read in covariates \n",
    "covariates_df = pd.read_csv(covariates_path, sep=\"\\t\")\n",
    "\n",
    "bio_covariates_list = ['age_RNA', 'height', 'weight', 'BMI', \"sex_0_1\"]\n",
    "cell_types_covariates_list = ['BA_D_10_9_L___RNA_imptd', 'BA_D_PCT___RNA_imptd', 'BA_N_10_9_L___RNA_imptd', \n",
    "            'BA_N_PCT___RNA_imptd', 'BASO_10_9_L___RNA_imptd', 'BASO_PCT___RNA_imptd', \n",
    "            'Delta_He_pg___RNA_imptd', 'EO_10_9_L___RNA_imptd', 'EO_PCT___RNA_imptd', \n",
    "            'FRC_10_12_L___RNA_imptd', 'FRC_PCT___RNA_imptd', 'H_IPF___RNA_imptd', \n",
    "            'HCT_PCT___RNA_imptd', 'HFLC_10_9_L___RNA_imptd', 'HFLC_PCT___RNA_imptd', \n",
    "            'HFR_PCT___RNA_imptd', 'HGB_g_dL___RNA_imptd', 'HYPER_He_PCT___RNA_imptd', \n",
    "            'HYPO_He_PCT___RNA_imptd', 'IG_10_9_L___RNA_imptd', 'IG_PCT___RNA_imptd', \n",
    "            'IPF___RNA_imptd', 'IPFx_10_9_L___RNA_imptd', 'IRF_PCT___RNA_imptd', \n",
    "            'IRF_Y_ch___RNA_imptd', 'LFR_PCT___RNA_imptd', 'LY_WX___RNA_imptd', \n",
    "            'LY_WY___RNA_imptd', 'LY_WZ___RNA_imptd', 'LY_X_ch___RNA_imptd', \n",
    "            'LY_Y_ch___RNA_imptd', 'LY_Z_ch___RNA_imptd', 'LYMP_10_9_L___RNA_imptd', \n",
    "            'LYMP_PCT___RNA_imptd', 'LYMPH_10_9_L___RNA_imptd', 'LYMPH_PCT___RNA_imptd', \n",
    "            'MacroR_PCT___RNA_imptd', 'MCH_pg___RNA_imptd', 'MCHC_g_dL___RNA_imptd', \n",
    "            'MCV_fL___RNA_imptd', 'MFR_PCT___RNA_imptd', 'MicroR_PCT___RNA_imptd', \n",
    "            'MO_WX___RNA_imptd', 'MO_WY___RNA_imptd', 'MO_WZ___RNA_imptd', 'MO_X_ch___RNA_imptd', \n",
    "            'MO_Y_ch___RNA_imptd', 'MO_Z_ch___RNA_imptd', 'MONO_10_9_L___RNA_imptd', \n",
    "            'MONO_PCT___RNA_imptd', 'MPV_fL___RNA_imptd', 'NE_FSC_ch___RNA_imptd', \n",
    "            'NE_SFL_ch___RNA_imptd', 'NE_SSC_ch___RNA_imptd', 'NE_WX___RNA_imptd', \n",
    "            'NE_WY___RNA_imptd', 'NE_WZ___RNA_imptd', 'NEUT_10_9_L___RNA_imptd', \n",
    "            'NEUT_PCT___RNA_imptd', 'NEUTx_10_9_L___RNA_imptd', 'NEUTx_PCT___RNA_imptd', \n",
    "            'NRBC_10_9_L___RNA_imptd', 'NRBC_PCT___RNA_imptd', 'P_LCR_PCT___RNA_imptd', \n",
    "            'PCT_PCT___RNA_imptd', 'PDW_fL___RNA_imptd', 'PLT_10_9_L___RNA_imptd', \n",
    "            'PLT_F_10_9_L___RNA_imptd', 'PLT_I_10_9_L___RNA_imptd', 'PLT_O_10_9_L___RNA_imptd', \n",
    "            'RBC_10_12_L___RNA_imptd', 'RBC_He_pg___RNA_imptd', 'RBC_O_10_12_L___RNA_imptd', \n",
    "            'RDW_CV_PCT___RNA_imptd', 'RDW_SD_fL___RNA_imptd', 'RET_10_6_uL___RNA_imptd', \n",
    "            'RET_He_pg___RNA_imptd', 'RET_PCT___RNA_imptd', 'RET_RBC_Y_ch___RNA_imptd', \n",
    "            'RET_TNC___RNA_imptd', 'RET_UPP___RNA_imptd', 'RET_Y_ch___RNA_imptd', 'RPI___RNA_imptd', \n",
    "            'TNC_10_9_L___RNA_imptd', 'TNC_D_10_9_L___RNA_imptd', 'TNC_N_10_9_L___RNA_imptd', \n",
    "            'WBC_10_9_L___RNA_imptd', 'WBC_D_10_9_L___RNA_imptd', 'WBC_N_10_9_L___RNA_imptd']\n",
    "            \n",
    "tech_covariates_list = ['Conc_ng_ul', 'OD_260_280','OD_260_230','Yield_ng','Agilent_28S_18S',\n",
    "                        'Agilent_Conc_ng_ul', 'Agilent_Yield_ng',\n",
    "                        'Agilent_RINe_imptd_by_batch','Assigned', 'Unassigned_MultiMapping', \n",
    "                        'Unassigned_NoFeatures', 'Unassigned_Ambiguity','gc_percent_forward_read', \n",
    "                        'gc_percent_reverse_read', 'adapters_percent_forward_read', \n",
    "                        'adapters_percent_reverse_read', 'percent_mapped', 'percent_duplicate', \n",
    "                        'rna_exonic_rate', 'rna_rrna_rate', 'rna_globin_percent_tpm', \n",
    "                        'rna_mitochondrial_percent_tpm', 'num_reads', 'RawReadDepth',\n",
    "                        'RawReadDepth_fromFastQFile']\n",
    "\n",
    "other_covariates = ['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10','PC11','PC12',\n",
    "                    'PC13','PC14','PC15','PC16','PC17','PC18','PC19','PC20','Season_Winter',\n",
    "                    'Season_Autumn','Season_Spring', 'Season_Summer','sequencingBatch_10',\n",
    "                    'sequencingBatch_4','sequencingBatch_15','sequencingBatch_3','sequencingBatch_5',\n",
    "                    'sequencingBatch_8','sequencingBatch_2', 'sequencingBatch_6','sequencingBatch_14',\n",
    "                    'sequencingBatch_11','sequencingBatch_1','sequencingBatch_12','sequencingBatch_9',\n",
    "                    'sequencingBatch_7','sequencingBatch_13',]\n",
    "\n",
    "covariate_list = bio_covariates_list + cell_types_covariates_list + tech_covariates_list + other_covariates\n",
    "covariates_susbet_df = covariates_df[[\"rna_id\"] + covariate_list]\n",
    "tpm_mtx_cov_merged_df = pd.merge(tpm_mtx_inactive_tp_clean_df, \n",
    "                                 covariates_susbet_df, \n",
    "                                 on=\"rna_id\", \n",
    "                                 how=\"inner\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2f6e6739",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of biological covariates: 5\n",
      "Number of Sysmex cell-types: 89\n",
      "Number of technical covariates: 25\n",
      "Number of other covariates: 39\n",
      "Total number of measured covariates: 158\n"
     ]
    }
   ],
   "source": [
    "print(f\"Number of biological covariates: {len(bio_covariates_list)}\")\n",
    "print(f\"Number of Sysmex cell-types: {len(cell_types_covariates_list)}\")\n",
    "print(f\"Number of technical covariates: {len(tech_covariates_list)}\")\n",
    "print(f\"Number of other covariates: {len(other_covariates)}\")\n",
    "print(f\"Total number of measured covariates: {len(covariate_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "625c9e9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# correlate gene expression with covariates \n",
    "cov_gene_id_corr_dict = {}\n",
    "cov_gene_id_pval_dict = {}\n",
    "for gene_id in gene_test_set: \n",
    "    cov_gene_id_corr_dict[gene_id], cov_gene_id_pval_dict[gene_id] = [], []\n",
    "    for covariate in covariate_list:\n",
    "        rho, pval = spearmanr(tpm_mtx_cov_merged_df[gene_id].values, tpm_mtx_cov_merged_df[covariate].values)\n",
    "        cov_gene_id_corr_dict[gene_id].append(rho)\n",
    "        cov_gene_id_pval_dict[gene_id].append(pval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f53d2859",
   "metadata": {},
   "outputs": [],
   "source": [
    "# rho values \n",
    "cov_gene_id_corr_df = pd.DataFrame.from_dict(cov_gene_id_corr_dict, orient=\"index\", columns=covariate_list)\n",
    "cov_gene_id_corr_clean_df = cov_gene_id_corr_df.reset_index().rename(columns={\"index\":\"gene_id\"})\n",
    "# write to file \n",
    "cov_gene_id_corr = out_dir_path.joinpath(\"misexp_corr_cov.csv\")\n",
    "cov_gene_id_corr_clean_df.to_csv(cov_gene_id_corr, index=False)\n",
    "# flatten \n",
    "cov_gene_id_corr_flat_df = pd.melt(cov_gene_id_corr_clean_df, id_vars=\"gene_id\").rename(columns={\"value\":\"spearman_rho\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "115f7216",
   "metadata": {},
   "outputs": [],
   "source": [
    "# p-values \n",
    "cov_gene_id_pval_df = pd.DataFrame.from_dict(cov_gene_id_pval_dict, orient=\"index\", columns=covariate_list)\n",
    "cov_gene_id_pval_clean_df = cov_gene_id_pval_df.reset_index().rename(columns={\"index\":\"gene_id\"})\n",
    "# write to file \n",
    "cov_gene_id_pval = out_dir_path.joinpath(\"misexp_pval_cov.csv\")\n",
    "cov_gene_id_pval_clean_df.to_csv(cov_gene_id_pval, index=False)\n",
    "# flatten \n",
    "cov_gene_id_pval_flat_df = pd.melt(cov_gene_id_pval_clean_df, id_vars=\"gene_id\").rename(columns={\"value\":\"pval\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "6d4ab9ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of xCell features: 67\n",
      "Total number of covariates: 225\n"
     ]
    }
   ],
   "source": [
    "# correlation with inferred cell enrichments (xCell)\n",
    "xcell_enrich_df = pd.read_csv(xcell_enrich_path, sep=\"\\t\")\n",
    "\n",
    "xcell_features = xcell_enrich_df.columns.tolist()\n",
    "print(f\"Number of xCell features: {len(xcell_features)}\")\n",
    "xcell_enrich_reidx_df = xcell_enrich_df.reset_index().rename(columns={\"index\":\"rna_id\"})\n",
    "\n",
    "tpm_mtx_xcell_merged_df = pd.merge(tpm_mtx_inactive_tp_clean_df, \n",
    "                               xcell_enrich_reidx_df, \n",
    "                               on=\"rna_id\", \n",
    "                               how=\"inner\")\n",
    "print(f\"Total number of covariates: {len(xcell_features) + len(covariate_list)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4f098666",
   "metadata": {},
   "outputs": [],
   "source": [
    "xcell_gene_id_corr_dict = {}\n",
    "xcell_gene_id_pval_dict = {}\n",
    "for i, gene_id in enumerate(gene_test_set):\n",
    "    rho_list, pval_list = [gene_id], [gene_id]\n",
    "    for cell in xcell_features: \n",
    "        rho, pval = spearmanr(tpm_mtx_xcell_merged_df[gene_id].values, tpm_mtx_xcell_merged_df[cell].values)\n",
    "        rho_list.append(rho)\n",
    "        pval_list.append(pval) \n",
    "    xcell_gene_id_corr_dict[i] = rho_list\n",
    "    xcell_gene_id_pval_dict[i] = pval_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a894e7a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# rho values \n",
    "xcell_gene_id_corr_rho_df = pd.DataFrame.from_dict(xcell_gene_id_corr_dict, orient=\"index\", columns=[\"gene_id\"] + xcell_features)\n",
    "# write to file \n",
    "xcell_gene_id_corr = out_dir_path.joinpath(\"misexp_corr_xcell.csv\")\n",
    "xcell_gene_id_corr_rho_df.to_csv(xcell_gene_id_corr, index=False)\n",
    "# flatten \n",
    "xcell_gene_id_corr_rho_flat_df = pd.melt(xcell_gene_id_corr_rho_df, id_vars=\"gene_id\").rename(columns={\"value\":\"spearman_rho\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "1bf61c59",
   "metadata": {},
   "outputs": [],
   "source": [
    "# p-values \n",
    "xcell_gene_id_corr_p_df = pd.DataFrame.from_dict(xcell_gene_id_pval_dict, orient=\"index\", columns=[\"gene_id\"] + xcell_features)\n",
    "# write to file \n",
    "xcell_gene_id_pval = out_dir_path.joinpath(\"misexp_pval_xcell.csv\")\n",
    "xcell_gene_id_corr_p_df.to_csv(xcell_gene_id_pval, index=False)\n",
    "# flatten \n",
    "xcell_gene_id_corr_p_flat_df = pd.melt(xcell_gene_id_corr_p_df, id_vars=\"gene_id\").rename(columns={\"value\":\"pval\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "370f2d5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "### remove later - added to not rerun above \n",
    "cov_gene_id_corr = out_dir_path.joinpath(\"misexp_corr_cov.csv\")\n",
    "cov_gene_id_corr_clean_df = pd.read_csv(cov_gene_id_corr)\n",
    "\n",
    "cov_gene_id_pval = out_dir_path.joinpath(\"misexp_pval_cov.csv\")\n",
    "cov_gene_id_pval_clean_df = pd.read_csv(cov_gene_id_pval)\n",
    "\n",
    "xcell_gene_id_corr = out_dir_path.joinpath(\"misexp_corr_xcell.csv\")\n",
    "xcell_gene_id_corr_rho_df = pd.read_csv(xcell_gene_id_corr)\n",
    "\n",
    "xcell_gene_id_pval = out_dir_path.joinpath(\"misexp_pval_xcell.csv\")\n",
    "xcell_gene_id_corr_p_df = pd.read_csv(xcell_gene_id_pval)\n",
    "\n",
    "# flatten \n",
    "cov_gene_id_corr_flat_df = pd.melt(cov_gene_id_corr_clean_df, id_vars=\"gene_id\").rename(columns={\"value\":\"spearman_rho\"})\n",
    "cov_gene_id_pval_flat_df = pd.melt(cov_gene_id_pval_clean_df, id_vars=\"gene_id\").rename(columns={\"value\":\"pval\"})\n",
    "xcell_gene_id_corr_rho_flat_df = pd.melt(xcell_gene_id_corr_rho_df, id_vars=\"gene_id\").rename(columns={\"value\":\"spearman_rho\"})\n",
    "xcell_gene_id_corr_p_flat_df = pd.melt(xcell_gene_id_corr_p_df, id_vars=\"gene_id\").rename(columns={\"value\":\"pval\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b776c83d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# combine all p-values\n",
    "all_cov_gene_id_pval_df = pd.concat([cov_gene_id_pval_flat_df, xcell_gene_id_corr_p_flat_df])\n",
    "# multiple testing correction \n",
    "pval_as_array = all_cov_gene_id_pval_df.pval.to_numpy()\n",
    "pass_test, pval_adj, _, _ = multitest.multipletests(pval_as_array, alpha=0.05, method=\"fdr_bh\")\n",
    "all_cov_gene_id_pval_df[\"pass\"] = pass_test\n",
    "all_cov_gene_id_pval_df[\"pval_adj\"] = pval_adj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "da2a4421",
   "metadata": {},
   "outputs": [],
   "source": [
    "# multiple testing correction across all p-values \n",
    "# only remove genes with high correlation and p-value passing threshold "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "55f81a94",
   "metadata": {},
   "outputs": [],
   "source": [
    "# combine correlations \n",
    "all_cov_gene_id_corr_df = pd.concat([xcell_gene_id_corr_rho_flat_df, cov_gene_id_corr_flat_df])\n",
    "# add p-values \n",
    "all_cov_gene_id_corr_p_df = pd.merge(all_cov_gene_id_corr_df, \n",
    "                                     all_cov_gene_id_pval_df, \n",
    "                                     on=[\"gene_id\", \"variable\"], \n",
    "                                     how=\"inner\"\n",
    "                                    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9348bef9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes to remove: 129\n"
     ]
    }
   ],
   "source": [
    "# identify correlated genes \n",
    "genes_to_remove = all_cov_gene_id_corr_p_df[(all_cov_gene_id_corr_p_df.spearman_rho.abs() > spearman_rho_cutoff) & \n",
    "                                            (all_cov_gene_id_corr_p_df[\"pass\"])\n",
    "                                           ].gene_id.unique()\n",
    "print(f\"Number of genes to remove: {len(genes_to_remove)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "2d7ec8a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write genes to file \n",
    "genes_corr_tech_covs_path = out_dir_path.joinpath(\"genes_corr_tech_covs.txt\")\n",
    "with open(genes_corr_tech_covs_path, 'w') as f_out: \n",
    "    for gene_id in genes_to_remove: \n",
    "        f_out.write(f\"{gene_id}\\n\")\n",
    "# write all covariates to file \n",
    "gene_id_corr_all_pval = out_dir_path.joinpath(\"misexp_corr_corr_all_covs.csv\")\n",
    "all_cov_gene_id_corr_df.to_csv(gene_id_corr_all_pval, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "a265bb0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read in flat z-score matrix \n",
    "zscore_tpm_flat_df = pd.read_csv(zscore_tpm_flat_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "41e39ef2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of genes: 8779\n",
      "Number of genes remaining: 8650\n",
      "Percentage of genes removed: 1.4694156509853058\n"
     ]
    }
   ],
   "source": [
    "# metrics - percentage of genes removed, total inactive genes = 8,780\n",
    "number_smpls = len(zscore_tpm_flat_df.rna_id.unique())\n",
    "number_genes = set(inactive_genes)\n",
    "print(f\"Total number of genes: {len(number_genes)}\")\n",
    "genes_remaining = number_genes - set(genes_to_remove)\n",
    "number_genes_remaining = len(genes_remaining)\n",
    "print(f\"Number of genes remaining: {number_genes_remaining}\")\n",
    "print(f\"Percentage of genes removed: {(len(genes_to_remove)/len(number_genes))*100}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "118cb3f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "genes_remaining_path = out_dir_path.joinpath(f\"gene_id_post_tech_cov_qc_{number_genes_remaining}.txt\")\n",
    "with open(genes_remaining_path, 'w') as f_out: \n",
    "    for gene_id in genes_remaining: \n",
    "        f_out.write(f\"{gene_id}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "8ae10bdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove genes correlated with technical covariates \n",
    "zscore_tpm_flat_rmvd_genes_df = zscore_tpm_flat_df[~zscore_tpm_flat_df.gene_id.isin(genes_to_remove)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "1745680c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of genes remaining in matrix: 8610\n"
     ]
    }
   ],
   "source": [
    "gene_id_z_tpm_flat_remaining = zscore_tpm_flat_rmvd_genes_df.gene_id.unique()\n",
    "num_gene_id_z_tpm_flat_remaining = len(gene_id_z_tpm_flat_remaining)\n",
    "print(f\"Number of genes remaining in matrix: {num_gene_id_z_tpm_flat_remaining}\")\n",
    "# this number differs to the one above as we removed 40 genes with TPM=0 across all \n",
    "# samples to calculate z-scores "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "ece356b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write remaining genes to file \n",
    "xcell_gene_id_corr_pval = out_dir_path.joinpath(f\"tpm_zscore_{number_smpls}smpls_{num_gene_id_z_tpm_flat_remaining}genes_flat_misexp_corr_qc.csv\")\n",
    "zscore_tpm_flat_rmvd_genes_df.to_csv(xcell_gene_id_corr_pval, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5e3923e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
